Chroms

t1 <- Sys.time()
EH23a <- read.csv("../FigureSideo/EH23a.softmasked_nuccomp.csv")
EH23a$chrom_num <- sub(".+chr", "", EH23a$Id)
EH23a$chrom_num[ EH23a$chrom_num == "X" ] <- 10
EH23a$chrom_num <- as.numeric(EH23a$chrom_num)
EH23a[1:3, ]
EH23a[, 1:2]

EH23b <- read.csv("../FigureSideo/EH23b.softmasked_nuccomp.csv")
EH23b$chrom_num <- sub(".+chr", "", EH23b$Id)
EH23b$chrom_num[ EH23b$chrom_num == "X" ] <- 10
EH23b$chrom_num <- as.numeric(EH23b$chrom_num)
EH23b[1:3, ]

Syri

syri <- read.table("/media/knausb/E737-9B48/knausb/mm2_maps/EH23a_EH23b/EH23a_EH23brcsyri.out",
                   sep = "\t", na.strings = "-")
names(syri) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
#syri$ref_start[is.na(as.numeric(syri$ref_start))]
syri$ref_start <- as.numeric(syri$ref_start)
syri$ref_end <- as.numeric(syri$ref_end)
syri$query_start <- as.numeric(syri$query_start)
syri$query_end <- as.numeric(syri$query_end)

syri[1:3, ]
sort(table(syri$Annotation_Type), decreasing = TRUE)

syn <- syri[syri[, 11] == "SYN", ]
syn[1:3, ]
nrow(syn)
last_win <- 1
#coalesce <- 5e4
#coalesce <- 1e5
coalesce <- 4e5
#coalesce <- 1e6
for( j in 2:nrow(syn) ){
  if( syn$ref_start[j] - syn$ref_end[last_win] < coalesce &
      syn$query_start[j] - syn$query_end[last_win] < coalesce &
      syn$ref_chrom[j] == syn$ref_chrom[last_win] ){
        syn$ref_end[ last_win ] <- syn$ref_end[j]
        syn$query_end[ last_win ] <- syn$query_end[j]
        #syn$ref_end[j] <- NA
        syn$unique_ID[j] <- NA
  } else {
    last_win <- j
  }
}

#syn <- syn[ !is.na( syn$ref_end ), ]
syn <- syn[ !is.na( syn$unique_ID ), ]
syn[1:3, ]
nrow(syn)
min(syn$ref_end - syn$ref_start)
hist(syn$ref_end - syn$ref_start)

sort(syn$ref_end - syn$ref_start, decreasing = T)[1:10]

syn[1:3, ]

inv <- syri[syri$Annotation_Type == "INV", ]

dup <- syri[syri$Annotation_Type == "DUP", ]
dup[1:3, ]
nrow(dup)
trans <- syri[syri$Annotation_Type == "TRANS", ]
trans[1:3, ]
nrow(trans)

gribbon

# nrow(syn)
# x <- syn[1:1000, ]
# nsmooth <- 10
# min_len <- 1e5

#gribbon <- function(x, nsmooth = 50, min_len = 1e5){
gribbon <- function(
    x, 
    nsmooth = 10, 
    min_len = 1e5, 
    coalesce = 0, 
    invert = FALSE){
  
  x <- x[(x$ref_end - x$ref_start) >= min_len, ]
  x <- x[(x$query_end - x$query_start) >= min_len, ]
  if( nrow(x) <= 0 ){
    return("No rows")
  }
  
  # if( coalesce > 0 ){
  #   last_win <- 1
  #   for( j in 2:nrow(x) ){
  #     if( x$ref_start[j] - x$ref_end[last_win] < coalesce &
  #         x$query_start[j] - x$query_end[last_win] < coalesce ){
  #       x$ref_end[ last_win ] <- x$ref_end[j]
  #       x$query_end[ last_win ] <- x$query_end[j]
  #       x$ref_end[j] <- NA
  #     } else {
  #       last_win <- j
  #     }
  #   }
  #   x <- x[ !is.na( x$ref_end ), ]
  # }
  
  if( invert ){
    tmp <- x$query_start
    x$query_start <- x$query_end
    x$query_end <- tmp
  }
  
  x$ref_chrom_num <- sub(".+chr", "", x$ref_chrom)
  x$ref_chrom_num[ x$ref_chrom_num == "X" ] <- 10
  x$ref_chrom_num[ x$ref_chrom_num == "Y" ] <- 11
  x$ref_chrom_num <- as.numeric(x$ref_chrom_num)
  x$ref_chrom_num <- x$ref_chrom_num - 0.2
  x$query_chrom_num <- sub(".+chr", "", x$query_chrom)
  x$query_chrom_num[ x$query_chrom_num == "X" ] <- 10
  x$query_chrom_num[ x$query_chrom_num == "Y" ] <- 11
  x$query_chrom_num <- as.numeric(x$query_chrom_num)
  x$query_chrom_num <- x$query_chrom_num + 0.2
  
#  nrow(x)
  my_x <- seq( from = -4, to = 4, length.out = nsmooth)
  my_y <- 1/( 1 + exp(1)^-my_x)
  my_x <- (my_x + 4)/8
  # plot(my_x, my_y)
  
#  my_x  <- c(my_x, rev(my_x))
  my_polys <- data.frame( 
    ID = rep(x$unique_ID, each = nsmooth * 2),
    ref_chrom_num = rep(x$ref_chrom_num, each = nsmooth * 2),
#    ref_start = rep(NA, each = nsmooth * 2),
#    ref_end = rep(NA, each = nsmooth * 2),
    query_chrom_num = rep(x$query_chrom_num, each = nsmooth * 2),
#    query_start = rep(NA, each = nsmooth * 2),
#    query_end = rep(NA, each = nsmooth * 2)
     poly_xs = rep(NA, each = nsmooth * 2),
     poly_ys = rep(NA, each = nsmooth * 2)
  )
  my_polys[1:3, ]
  
  my_vs <- seq(1, nrow(my_polys) + 1, by = nsmooth * 2)
#  for( i in seq(1, nrow(my_polys), by = nsmooth * 2) ){
  for( i in 1:nrow(x) ){
    my_index <- my_vs[i]:(my_vs[i+1] - 1)
    xmins <- seq( x$ref_chrom_num[i], x$query_chrom_num[i], length.out = nsmooth)
    
#    my_y * (x$query_end[i] - x$query_start[i]) + x$query_start[i]
    
    ymins <- my_y * (x$query_start[i] - x$ref_start[i]) + x$ref_start[i]
    ymaxs <- my_y * (x$query_end[i] - x$ref_end[i]) + x$ref_end[i]
    
    my_polys$poly_xs[my_index]    <- c(xmins, rev(xmins))
    my_polys$poly_ys[my_index] <- c(ymins, rev(ymaxs))
    #x$ref_start[ my_index ] <- seq(x$ref_chrom[i], x$query_chrom[i], length.out = nsmooth)
    
  }
#  my_polys[1:3, ]
  return( my_polys )  
}
#poly_coords <- gribbon(syn[1:1000, ])
#poly_coords <- gribbon(syn, min_len = 1e5)
#
nrow(syn)
## [1] 40
poly_coords <- gribbon(syn, min_len = 1e1, coalesce = 0)

#poly_coords <- gribbon(syn, min_len = 1e3)
# poly_coords <- gribbon(syn, nsmooth = 40, min_len = 1e3)
# poly_coords <- gribbon(syn, min_len = 1e1)
length(unique(poly_coords$ID))
## [1] 40
poly_coords_inv <- gribbon(inv, min_len = 1e4, invert = TRUE)
poly_coords_dup <- gribbon(dup, min_len = 1e4)
poly_coords_trans <- gribbon(trans, min_len = 1e4)

get_wins

cmd <- "~/gits/hempy/bin/fast_win.py /media/knausb/Vining_lab/knausb/mm2_maps/EH23a_EH23b_v2/EH23b.rc458X.softmasked.fasta.gz --win_size 1000000"
#system(cmd)
# sampn <- "EH23b"

get_wins <- function( sampn ){
#  sampd <- "../FigureSideo/EH23a.softmasked_wins.csv"
  wins_dir <- "../FigureSideo/"
  gffs_dir <- "/media/knausb/E737-9B48/releases/scaffolded/"
  
#  sampn <- unlist(lapply(strsplit(sampd, split = "/"), function(x){x[length(x)]}))
#  sampn <- unlist(lapply(strsplit(sampd, split = "//"), function(x){x[length(x)]}))
#  sampn <- sub(".softmasked_wins.csv", "", sampn)
#  sub(paste(sampn, ".softmasked_wins.csv", sep = ""), "", sampd)
  
  if( sampn == "EH23b" ){
    wins <- read.csv( "EH23b.rc458X.softmasked_wins.csv" )
    wins$Id <- sub("EH23a", "EH23b", wins$Id)
  } else {
    wins <- read.csv( paste( wins_dir, sampn, ".softmasked_wins.csv", sep = "") )
  }
#  wins <- read.csv( paste( sampn, ".softmasked_wins.csv", sep = "") )
#  wins <- read.csv( sampd )
#  wins <- wins[ grep( paste(sampn, ".chr", sep = ""), wins$Id ), ]
  wins$chrn <- sub(".+chr", "", wins$Id)
  wins$chrn[ wins$chrn == "X" ] <- 10
  wins$chrn[ wins$chrn == "Y" ] <- 11
  wins$chrn <- as.numeric(wins$chrn)
  #wins[1:3, ]
  
  # Scale and center.
  # Scaline of CG windows (chromosome width) is on a per chromosome basis.
  #  wins$CGs <- 0
  wins$CGs <- wins$CG / wins$Win_length * 100
  #  wins$notCG <- (wins$Win_length / 1) - wins$CG
  #  wins$notCGs <- 0
  for( j in unique(wins$Id) ){
    wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j] - min(wins$CGs[ wins$Id == j], na.rm = TRUE)
    wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j] / max(wins$CGs[ wins$Id == j], na.rm = TRUE)
    #
    # wins$CGs[ wins$Id == j] <- wins$CG[ wins$Id == j] - min(wins$CG[ wins$Id == j], na.rm = TRUE)
    # wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j]/max(wins$CGs[ wins$Id == j], na.rm = TRUE)
    #
    # wins$notCGs[ wins$Id == j] <- wins$notCG[ wins$Id == j] - min(wins$notCG[ wins$Id == j], na.rm = TRUE)
    # wins$notCGs[ wins$Id == j] <- wins$notCGs[ wins$Id == j]/max(wins$notCGs[ wins$Id == j], na.rm = TRUE)
  }
  wins$iCGs <- 1 - wins$CGs
  
  #wins$ATs <- 1 - wins$CGs
  #wins$ATs <- wins$Win_length/1e6 - wins$CGs
  
  # genes <- read.table( 
  #   paste(sampd, "/", sampn, ".primary_high_confidence.gff3.gz", sep = "" ), 
  #   sep = "\t" )
  genes <- read.table( 
    paste(gffs_dir, sampn, "/", sampn, ".primary_high_confidence.gff3.gz", sep = "" ), 
    sep = "\t" )
  genes <- genes[genes[, 3] == "gene", ]
  
  if( sampn == "EH23b" ){
    # Column 4 is starts.
    genes[ genes[, 1] == "EH23b.chr4", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr4" ] - genes[ genes[, 1] == "EH23b.chr4", 4]
    genes[ genes[, 1] == "EH23b.chr4", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr4" ] - genes[ genes[, 1] == "EH23b.chr4", 5]
    
        genes[ genes[, 1] == "EH23b.chr5", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr5" ] - genes[ genes[, 1] == "EH23b.chr5", 4]
    genes[ genes[, 1] == "EH23b.chr5", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr5" ] - genes[ genes[, 1] == "EH23b.chr5", 5]
    
        genes[ genes[, 1] == "EH23b.chr8", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr8" ] - genes[ genes[, 1] == "EH23b.chr8", 4]
    genes[ genes[, 1] == "EH23b.chr8", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr8" ] - genes[ genes[, 1] == "EH23b.chr8", 5]
    
        genes[ genes[, 1] == "EH23b.chrX", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chrX" ] - genes[ genes[, 1] == "EH23b.chrX", 4]
    genes[ genes[, 1] == "EH23b.chrX", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chrX" ] - genes[ genes[, 1] == "EH23b.chrX", 5]
    
  }
  
  genes[1:3, ]
  
  
  # Windowize
  wins$gcnt <- 0
  for(i in 1:nrow(wins)){
    # if( sampn == "EH23b" ){
    #   wins$Id <- sub("EH23a", "EH23b", wins$Id)
    # }
    tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
    wins$gcnt[i] <- nrow(tmp)
  }
  
  # Scaling of gene count windows is on a per assembly basis.
  wins$gcntsc <- wins$gcnt - min(wins$gcnt)
  wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
  my_index <- round( wins$gcntsc * 100 )
  my_index[ my_index <= 0] <- 1
  
  # Color ramp.
  #wins$gcol <- heat.colors(n=100)[ my_index ]
  #wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
  #wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
  #wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
  #wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]
  
  #wins$gcol <- viridisLite::plasma(n = 100, alpha = 1, begin = 0.1, end = 1)[ my_index ]
  wins$gcol <- viridisLite::magma(n = 100, alpha = 1, begin = 0.2, end = 1.00)[ my_index ]
  
  return(wins)
}
EH23a_wins <- get_wins("EH23a")
EH23a_wins[1:3, ]
EH23b_wins <- get_wins("EH23b")
EH23b_wins[1:3, ]

Windowize SNPs

EH23a_wins[1:3, ]
##           Id Win_number   Start     End Win_length      A      C      G      T
## 1 EH23a.chr1          0       0  999999    1000000 339510 159349 157789 343352
## 2 EH23a.chr1          1 1000000 1999999    1000000 346184 152100 152139 349577
## 3 EH23a.chr1          2 2000000 2999999    1000000 343278 155236 154692 346794
##      CG   CHG   CHH chrn        CGs      iCGs gcnt    gcntsc      gcol
## 1 14003 19695 90544    1 0.07933312 0.9206669  156 1.0000000 #FCFDBFFF
## 2 13018 18441 88693    1 0.00000000 1.0000000  141 0.9038462 #FED799FF
## 3 14100 19606 89229    1 0.08714562 0.9128544  123 0.7884615 #FEAD77FF
snps <- syri[syri$Annotation_Type == "SNP", ]
nrow(snps)
## [1] 3660371
snps[1:3, ]
##     ref_chrom ref_start ref_end ref_seq query_seq query_chrom query_start
## 5  EH23a.chr1      6376    6376       c         t  EH23a.chr1       43110
## 8  EH23a.chr1      6542    6542       A         G  EH23a.chr1       43276
## 11 EH23a.chr1      6779    6779       t         c  EH23a.chr1       43514
##    query_end unique_ID parent_ID Annotation_Type Copy_Status
## 5      43110  SNP21930      SYN1             SNP        <NA>
## 8      43276  SNP21933      SYN1             SNP        <NA>
## 11     43514  SNP21936      SYN1             SNP        <NA>
EH23a_wins$snps <- 0

for(i in 1:nrow(EH23a_wins)){
  tmp <- snps[ snps$ref_chrom == EH23a_wins$Id[i] & snps$ref_start >= EH23a_wins$Start[i] & snps$ref_end <= EH23a_wins$End[i], ]
  EH23a_wins$snps[i] <- nrow(tmp)
}
EH23a_wins$snpssc <- (EH23a_wins$snps - min(EH23a_wins$snps))/max(EH23a_wins$snps)

hist(EH23a_wins$snps)

hist(EH23a_wins$snps[ EH23a_wins$Id == "EH23a.chr4" ])

my_hist <- hist(EH23a_wins$snpssc * 9 + 1, plot = F)

#barplot(my_hist$density, space = 0, col = gray.colors(n=10))
#axis(side=1)
barplot(my_hist$density ~ my_hist$breaks[-10], space = 0, col = gray.colors(n=10))

BLAST

sampn <- "EH23a"
ablstn <- read.csv(
    paste("/media/knausb/Vining_lab/knausb/blast_projects/todd_rrna/blastn_uc/", sampn, "_blastn.csv", sep = ""),
    header = FALSE)
colnames(ablstn) <- c("qseqid","qlen","sseqid","slen","qstart","qend",
                     "sstart","send","evalue","bitscore","score","length",
                     "pident","nident","mismatch","positive","gapopen",
                     "gaps","ppos","sstrand")
ablstn <- ablstn[grep("chr", ablstn$sseqid), ]
ablstn$chrn <- sub(".*chr", "", ablstn$sseqid)
ablstn$chrn[ablstn$chrn == "X"] <- 10
ablstn$chrn[ablstn$chrn == "Y"] <- 11
ablstn$chrn <- as.numeric(ablstn$chrn)


sampn <- "EH23b"
bblstn <- read.csv(
    paste("/media/knausb/Vining_lab/knausb/blast_projects/todd_rrna/blastn_uc/", sampn, "_blastn.csv", sep = ""),
    header = FALSE)
colnames(bblstn) <- c("qseqid","qlen","sseqid","slen","qstart","qend",
                     "sstart","send","evalue","bitscore","score","length",
                     "pident","nident","mismatch","positive","gapopen",
                     "gaps","ppos","sstrand")
bblstn <- bblstn[grep("chr", bblstn$sseqid), ]
bblstn$chrn <- sub(".*chr", "", bblstn$sseqid)
bblstn$chrn[bblstn$chrn == "X"] <- 10
bblstn$chrn[bblstn$chrn == "Y"] <- 11
bblstn$chrn <- as.numeric(bblstn$chrn)

my_chrom <- "EH23b.chr4"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chr5"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]

bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chr8"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chrX"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]

sampn <- "EH23b"

Plot

library(ggplot2)

chrom_wid <- 0.05
p <- ggplot2::ggplot()
p <- p + theme_bw()
p <- p + ggplot2::geom_rect( 
  data = EH23a, 
  ggplot2::aes( xmin = chrom_num - chrom_wid - 0.2,
                xmax = chrom_num + chrom_wid - 0.2,
                ymin = 1, ymax = Length),
  fill = "#DCDCDC",
  color = "#000000"
)

p <- p + ggplot2::geom_rect( 
  data = EH23b, 
  ggplot2::aes( xmin = chrom_num - chrom_wid + 0.2,
                xmax = chrom_num + chrom_wid + 0.2,
                ymin = 1, ymax = Length),
  fill = "#DCDCDC",
  color = "#000000"
)

# EH23a_wins[1:3, ]

my_cols <- gray.colors(n=10, start = 0.3, end = 0.9)[round(EH23a_wins$snpssc * 9 + 1)]
#my_cols <- gray.colors(n=100, start = 0.3, end = 0.9)[round(EH23a_wins$snpssc * 99 + 1)]
#p <- 
p + ggplot2::geom_rect( 
  data = EH23a_wins, 
  ggplot2::aes( xmin = chrn - 0.5,
                xmax = chrn - 0.25,
                ymin = Start, ymax = End),
#  fill = "#DCDCDC",
  fill = my_cols,
#  color = "#000000"
)

# Theme
p <- p + ggplot2::scale_x_continuous( 
  breaks = EH23a$chrom_num,
#  limits = c(0.6, 10.4),
#  limits = c(0.5, 10.5),
#  labels = EH23a$Id
  labels = sub("a.chr", ".chr", EH23a$Id)
)
  
p <- p + scale_y_continuous(
  breaks = seq( 0, 120e6, by = 10e6), 
  labels = seq( 0, 120, by = 10)
)
  
#  p <- p + ggplot2::theme_bw() + 
p <- p + ggplot2::theme( 
      #      panel.grid.minor.x = ggplot2::element_blank(), 
    panel.grid.minor.x = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 ),
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x=element_blank(),
    panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
    panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
  )
  
p <- p + ggplot2::ylab("Position (Mbp)")
p <- p + ggtitle( "EH23" )

p

#p <- p + theme(legend.position='none')
p <- p + geom_polygon( 
  data = poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID), #, fill = col),
  alpha = 2/5,
  show.legend = FALSE,
  #color = NA,
#  color = "#80808022",
#  color = "#000000",
  color = "#696969",
#  color = "#808080",
#  color = "#A9A9A9",
#  color = "#C0C0C0",
  linewidth = 0.4,
#            fill = ID
#  fill = "#C0C0C044"
  fill = "#808080"
  )

p

p <- p + geom_polygon( 
  data = poly_coords_inv, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 0.9,
  show.legend = FALSE,
  #color = NA,
#  color = "#80808022",
  #color = "#000000",
#  linewidth = 0.1,
#            fill = ID
#  fill = "#C0C0C044"
  fill = "#FFA500"
#  fill = "#808080"
  )
p

# p + geom_polygon( 
#   data = poly_coords_dup, 
#   aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
#   alpha = 0.9,
#   show.legend = FALSE,
#   fill = "#1E90FF"
#   )

# p + geom_polygon( 
#   data = poly_coords_trans, 
#   aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
#   alpha = 0.9,
#   show.legend = FALSE,
#   fill = "#1E90FF"
#   )


thinw <- 0.28
p <- p + ggplot2::geom_rect( 
  data = EH23a_wins, 
  ggplot2::aes(
    xmin = chrn - iCGs * thinw - 0.2,
    #xmax = chrn + iCGs * thinw,
    xmax = chrn - 0.2,
    ymin = Start, 
    ymax = End),
  fill = EH23a_wins$gcol,
  color = NA
)

p <- p + ggplot2::geom_rect( 
  data = EH23b_wins, 
  ggplot2::aes(
    #xmin = chrn - iCGs * thinw - 0.2,
    xmin = chrn + 0.2,
    xmax = chrn + iCGs * thinw + 0.2,
    #xmax = chrn - 0.2,
    ymin = Start, 
    ymax = End),
  fill = EH23b_wins$gcol,
  color = NA
)
p

#p + xlim(3.5, 4.5) #+ ylim(1e6, 2e6)

p + xlim(5.5, 8.5) + ylim(10e6, 40e6)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 10 rows containing missing values (`geom_rect()`).
## Removed 10 rows containing missing values (`geom_rect()`).
## Warning: Removed 654 rows containing missing values (`geom_rect()`).
## Warning: Removed 653 rows containing missing values (`geom_rect()`).

#pEH23 <- p
#save(pEH23, file = "ideo.RData")
#load("ideo.RData")
#pEH23
# Cannabinoids
my_rows <- grep("AB2|LY|NC_044376.1:c427", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
    data = ablstn[my_rows, ], 
    ggplot2::aes( 
      xmin = chrn - blstw,
      xmax = chrn + 0 - 0.14,
      ymin = sstart, 
      ymax = send),
    fill = "#228B22",
    color = '#228B22'
  )
my_rows <- grep("AB2|LY|NC_044376.1:c427", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
    data = bblstn[my_rows, ], 
    ggplot2::aes( 
      xmin = chrn + 0.14,
      xmax = chrn + blstw,
      ymin = sstart, 
      ymax = send),
    fill = "#228B22",
    color = '#228B22'
  )

# 5S
my_rows <- grep("5S_", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = ablstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn - blstw,
    xmax = chrn - 0.1,
    ymin = sstart, 
    ymax = send),
  fill = "#000000",
  color = '#000000'
)
my_rows <- grep("5S_", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = bblstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn + 0.1,
    xmax = chrn + blstw,
    ymin = sstart, 
    ymax = send),
  fill = "#000000",
  color = '#000000'
)

# 45S
my_rows <- grep("45s_|26s_|5.8s_|18s_", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = ablstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn - blstw,
    xmax = chrn - 0.1,
    ymin = sstart, 
    ymax = send),
  fill = "#B22222",
  linewidth = 2,
  #    linewidth = 4,
  color = '#B22222'
)
my_rows <- grep("45s_|26s_|5.8s_|18s_", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = bblstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn + 0.1,
    xmax = chrn + blstw,
    ymin = sstart, 
    ymax = send),
  fill = "#B22222",
  linewidth = 2,
  #    linewidth = 4,
  color = '#B22222'
)

# Centromeres, subtelomeric repeats
my_rows <- grep("AH3Ma_CS-237_satelite|CsatSD_centromere_237bp|CsatSD_centromere_370bp", ablstn$qseqid)
#  blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = ablstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn - 0.5,
    xmax = chrn - 0.2,
    ymin = sstart, 
    ymax = send),
  fill = "#1E90FF",
  color = '#1E90FF'
)
my_rows <- grep("AH3Ma_CS-237_satelite|CsatSD_centromere_237bp|CsatSD_centromere_370bp", bblstn$qseqid)
p <- p + ggplot2::geom_rect( 
  data = bblstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn + 0.2,
    xmax = chrn + 0.5,
    ymin = sstart, 
    ymax = send),
  fill = "#1E90FF",
  color = '#1E90FF'
)
pEH23 <- p
pEH23

VCF

library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.14.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
#vcf <- read.vcfR("F2s.variants5.freebayes.filt.3.vcf.gz", nrows = )
#vcf <- read.vcfR("F2s_0NA.vcf.gz", nrows = )
vcf <- read.vcfR("/media/knausb/Vining_lab/private_data/salk/VCFs/F2s_0NA.vcf.gz", nrows = )
t10 <- Sys.time()
vcf
## ***** Object of Class vcfR *****
## 271 samples
## 10 CHROMs
## 35,074 variants
## Object size: 462.4 Mb
## 0 percent missing data
## *****        *****         *****
#vcf <- vcf[is.biallelic(vcf), ]
#vcf
vcf@gt <- vcf@gt[, grep("Parent_23", colnames(vcf@gt), invert  = TRUE)]
#colnames(vcf@gt)
vcf
## ***** Object of Class vcfR *****
## 270 samples
## 10 CHROMs
## 35,074 variants
## Object size: 458.4 Mb
## 0 percent missing data
## *****        *****         *****
vcf@fix[1:3, 1:8]
##      CHROM    POS      ID REF ALT QUAL      FILTER INFO
## [1,] "chr_1e" "62350"  NA "G" "A" "246.804" "PASS" NA  
## [2,] "chr_1e" "98473"  NA "C" "T" "2122.13" "PASS" NA  
## [3,] "chr_1e" "204226" NA "C" "G" "61311.8" "PASS" NA

Nucs

#
nucs2 <- read.csv("~/gits/CannabisPangenome/FigureSideo/EH23b.softmasked_nuccomp.csv")
#nucs <- read.csv("~/gits/CannabisPangenome/FigureSideo/EH23b.softmasked_nuccomp.csv")
nucs <- read.csv("/media/knausb/Vining_lab/private_data/salk/ERBxHO40_23_v2_hap_ERB/nuccomp/ERBxHO40_23.combined.v2.hic.hap_ERB_nuccomp.csv")

cbind(nucs2$Id, nucs$Id)
##       [,1]         [,2]    
##  [1,] "EH23b.chr1" "chr_1e"
##  [2,] "EH23b.chr5" "chr_2e"
##  [3,] "EH23b.chr2" "chr_3e"
##  [4,] "EH23b.chr3" "chr_4e"
##  [5,] "EH23b.chr4" "chr_5e"
##  [6,] "EH23b.chr7" "chr_6e"
##  [7,] "EH23b.chr8" "chr_7e"
##  [8,] "EH23b.chr9" "chr_8e"
##  [9,] "EH23b.chr6" "chr_9e"
## [10,] "EH23b.chrX" "chr_Xe"

Adjust chromosome names

vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_1e" ] <- "chr1"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_2e" ] <- "chr5"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_3e" ] <- "chr2"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_4e" ] <- "chr3"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_5e" ] <- "chr4"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_6e" ] <- "chr7"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_7e" ] <- "chr8"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_8e" ] <- "chr9"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_9e" ] <- "chr6"
vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "chr_Xe" ] <- "chrX"
#vcf@fix[, "CHROM"] <- sub("EH23b.", "", vcf@fix[, "CHROM"])
unique(vcf@fix[, "CHROM"])
##  [1] "chr1" "chr5" "chr2" "chr3" "chr4" "chr7" "chr8" "chr9" "chr6" "chrX"
nucs$Id[ nucs$Id == "chr_1e" ] <- "chr1"
nucs$Id[ nucs$Id == "chr_2e" ] <- "chr5"
nucs$Id[ nucs$Id == "chr_3e" ] <- "chr2"
nucs$Id[ nucs$Id == "chr_4e" ] <- "chr3"
nucs$Id[ nucs$Id == "chr_5e" ] <- "chr4"
nucs$Id[ nucs$Id == "chr_6e" ] <- "chr7"
nucs$Id[ nucs$Id == "chr_7e" ] <- "chr8"
nucs$Id[ nucs$Id == "chr_8e" ] <- "chr9"
nucs$Id[ nucs$Id == "chr_9e" ] <- "chr6"
nucs$Id[ nucs$Id == "chr_Xe" ] <- "chrX"

nucs$chrom_num <- sub("^chr", "", nucs$Id)
nucs$chrom_num[ nucs$chrom_num == "X" ] <- 10
nucs$chrom_num <- as.numeric(nucs$chrom_num)

nucs <- nucs[sort.int(nucs$chrom_num, decreasing = FALSE, index.return = TRUE)$ix, ]
nucs$Id
##  [1] "chr1" "chr2" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "chr9" "chrX"

Sort VCF

vcf2 <- vcf[ vcf@fix[, "CHROM"] == "chr1", ]
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr2", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr3", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr4", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr5", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr6", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr7", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr8", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr9", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chrX", ])
vcf <- vcf2
rm(vcf2)
nucs[1:3, ]
##     Id   Length a        A c        C g        G t        T w W s S m M k K r R
## 1 chr1 63722626 0 21262072 0 10570136 0 10558163 0 21323755 0 0 0 0 0 0 0 0 0 0
## 3 chr2 80157000 0 26408690 0 13765866 0 13770753 0 26207691 0 0 0 0 0 0 0 0 0 0
## 4 chr3 83805000 0 27490094 0 14453690 0 14424197 0 27432019 0 0 0 0 0 0 0 0 0 0
##   y Y n    N chrom_num
## 1 0 0 0 8500         1
## 3 0 0 0 4000         2
## 4 0 0 0 5000         3
nucs$Id
##  [1] "chr1" "chr2" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "chr9" "chrX"
nucs$ends <- 0
nucs$ends[1] <- nucs$Length[1]
for(i in 2:nrow(nucs)){
  nucs$ends[i] <- nucs$ends[i-1] + nucs$Length[i]
}
nucs$mids <- nucs$ends - nucs$Length/2
nucs[1:3, ]
##     Id   Length a        A c        C g        G t        T w W s S m M k K r R
## 1 chr1 63722626 0 21262072 0 10570136 0 10558163 0 21323755 0 0 0 0 0 0 0 0 0 0
## 3 chr2 80157000 0 26408690 0 13765866 0 13770753 0 26207691 0 0 0 0 0 0 0 0 0 0
## 4 chr3 83805000 0 27490094 0 14453690 0 14424197 0 27432019 0 0 0 0 0 0 0 0 0 0
##   y Y n    N chrom_num      ends      mids
## 1 0 0 0 8500         1  63722626  31861313
## 3 0 0 0 4000         2 143879626 103801126
## 4 0 0 0 5000         3 227684626 185782126
my_pos <- rePOS(vcf, lens = nucs)
my_popsum <- gt.to.popsum(vcf)
#my_popsum <- as.matrix(my_popsum)
my_popsum[1:3, ]
##     n Allele_counts        He       Ne
## 1 270        509,31 0.1082236 1.121357
## 2 270       412,128 0.3617010 1.566664
## 3 270       263,277 0.4996639 1.998657
my_popsum <- cbind(vcf@fix[ , 1:2 ], my_popsum)
my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne
## 1  chr1  62350 270        509,31 0.1082236 1.121357
## 2  chr1  98473 270       412,128 0.3617010 1.566664
## 3  chr1 204226 270       263,277 0.4996639 1.998657
class(my_popsum)
## [1] "data.frame"
gt <- extract.gt(vcf)
t(apply(gt[1:4, ], MARGIN = 1, function(x){ table(x, useNA = 'always') }))
##             0/0 0/1 1/1 <NA>
## chr1_62350  240  29   1    0
## chr1_98473  145 122   3    0
## chr1_204226  65 133  72    0
## chr1_222275  63 144  63    0
nrow(gt) * 4
## [1] 140296
#, levels = c("0/0", "0/1", "1/1"))

#tmp <- apply(gt, MARGIN = 1, function(x){ table(x, useNA = 'always') })
tmp <- apply(gt, MARGIN = 1, function(x){ 
  table(factor(x, levels = c("0/0", "0/1", "1/1", NA), exclude = NULL)) 
  })

length(tmp)
## [1] 140296
range(lapply(tmp, length))
## [1] 1 1
tmp <- matrix(tmp, ncol = 4, byrow = TRUE)
colnames(tmp) <- c("count0.0", "count0.1", "count1.1", "countNA")
tmp[1:5, ]
##      count0.0 count0.1 count1.1 countNA
## [1,]      240       29        1       0
## [2,]      145      122        3       0
## [3,]       65      133       72       0
## [4,]       63      144       63       0
## [5,]       61      141       68       0
my_popsum <- cbind(my_popsum, tmp)
my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1  chr1  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2  chr1  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA
## 1       0
## 2       0
## 3       0
tmp <- matrix(unlist(strsplit(my_popsum$Allele_counts, split = ",")), ncol = 2, byrow = TRUE)
colnames(tmp) <- c("ERBcount", "HO40count")

my_popsum <- cbind(my_popsum, tmp)
my_popsum$ERBcount <- as.numeric(my_popsum$ERBcount)
my_popsum$HO40count <- as.numeric(my_popsum$HO40count)

my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1  chr1  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2  chr1  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count
## 1       0      509        31
## 2       0      412       128
## 3       0      263       277
class(my_popsum)
## [1] "data.frame"
my_popsum$ERBfreq <- my_popsum$ERBcount/(my_popsum[, "n"] * 2)
my_popsum$HO40freq <- my_popsum$HO40count/(my_popsum[, "n"] * 2)
my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1  chr1  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2  chr1  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq
## 1       0      509        31 0.9425926 0.05740741
## 2       0      412       128 0.7629630 0.23703704
## 3       0      263       277 0.4870370 0.51296296
my_popsum$He2 <- 2 * my_popsum$ERBfreq * my_popsum$HO40freq
my_popsum$POS2 <- my_pos
my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1  chr1  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2  chr1  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226
# write.table(x = my_popsum, file = "allele_freqs.csv", sep = ",",
#             row.names = FALSE, col.names = TRUE)

FIS

Hs <- 2 * my_popsum$ERBfreq * my_popsum$HO40freq
Ho <- my_popsum$count0.1 / my_popsum$n
my_popsum$Fis <- (Hs - Ho)/Hs
my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1  chr1  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2  chr1  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2          Fis
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350  0.007541669
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473 -0.249241505
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226  0.014152174

Thin

#my_popsum <- my_popsum[seq(1, nrow(my_popsum), by = 2), ]
# hist(my_popsum$ERBfreq)
# hist(my_popsum$HO40freq)

#my_popsum <- my_popsum[my_popsum$ERBfreq >= 0.4, ]
#my_popsum <- my_popsum[my_popsum$ERBfreq <= 0.6, ]
#
#my_popsum <- my_popsum[my_popsum$HO40freq >= 0.4, ]
#my_popsum <- my_popsum[my_popsum$HO40freq <= 0.6, ]

my_min <- 0.3
my_max <- 0.7
my_popsum <- my_popsum[my_popsum$ERBfreq >= my_min, ]
my_popsum <- my_popsum[my_popsum$ERBfreq <= my_max, ]
#
my_popsum <- my_popsum[my_popsum$HO40freq >= my_min, ]
my_popsum <- my_popsum[my_popsum$HO40freq <= my_max, ]

my_min <- -0.5
my_max <- 0.5
my_popsum <- my_popsum[my_popsum$Fis >= my_min, ]
my_popsum <- my_popsum[my_popsum$Fis <= my_max, ]


par( mfrow = c(1, 2) )
#
hist(my_popsum$HO40freq, xlab = "", ylab = "Count", main = "HO40 allele frequency")
#
hist(my_popsum$ERBfreq, xlab = "", ylab = "Count", main = "ERB allele frequency")

par( mfrow = c(1, 1) )
barplot( table( my_popsum$CHROM ), las = 3 )
title( ylab = "Variant count" )
title( main = paste("Total Variant count:", format( nrow(my_popsum), big.mark = ",") ) )

Write VCF

my_popsum[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
## 4  chr1 222275 270       270,270 0.5000000 2.000000       63      144       63
## 5  chr1 249752 270       263,277 0.4996639 1.998657       61      141       68
##   countNA ERBcount HO40count  ERBfreq HO40freq       He2   POS2         Fis
## 3       0      263       277 0.487037 0.512963 0.4996639 204226  0.01415217
## 4       0      270       270 0.500000 0.500000 0.5000000 222275 -0.06666667
## 5       0      263       277 0.487037 0.512963 0.4996639 249752 -0.04514694
nrow(my_popsum)
## [1] 22567
vcf@fix[1:3, 1:8]
##      CHROM  POS      ID REF ALT QUAL      FILTER INFO
## [1,] "chr1" "62350"  NA "G" "A" "246.804" "PASS" NA  
## [2,] "chr1" "98473"  NA "C" "T" "2122.13" "PASS" NA  
## [3,] "chr1" "204226" NA "C" "G" "61311.8" "PASS" NA
nrow(vcf)
## [1] 35074
tmp1 <- paste(getCHROM(vcf), getPOS(vcf), sep = "_")
tmp2 <- paste( my_popsum$CHROM, my_popsum$POS, sep = "_")

vcf <- vcf[tmp1 %in% tmp2, ]
vcf
## ***** Object of Class vcfR *****
## 270 samples
## 10 CHROMs
## 22,567 variants
## Object size: 260.4 Mb
## 0 percent missing data
## *****        *****         *****
# write.vcf(vcf, file = "tmsalk.vcf.gz")

Long form

afreq <- my_popsum

EH23a <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23a$Frequency <- EH23a$ERBfreq
EH23a$facet1 <- "EH23a"

EH23b <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23b$Frequency <- EH23a$HO40freq
EH23b$facet1 <- "EH23b"

Het <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
Het$Frequency <- EH23a$He2
Het$facet1 <- "Het"
Het[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
## 4  chr1 222275 270       270,270 0.5000000 2.000000       63      144       63
## 5  chr1 249752 270       263,277 0.4996639 1.998657       61      141       68
##   countNA ERBcount HO40count  ERBfreq HO40freq       He2   POS2 Frequency
## 3       0      263       277 0.487037 0.512963 0.4996639 204226 0.4996639
## 4       0      270       270 0.500000 0.500000 0.5000000 222275 0.5000000
## 5       0      263       277 0.487037 0.512963 0.4996639 249752 0.4996639
##   facet1
## 3    Het
## 4    Het
## 5    Het
Fis_df <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
#Fis_df$Frequency <- Fis
Fis_df$Frequency <- afreq$Fis
Fis_df$facet1 <- "Fis"
#
afreq <- rbind(EH23a, EH23b, Het, Fis_df)
#afreq <- rbind(EH23a, EH23b, Het)
afreq$facet2 <- afreq$facet1
afreq$facet2 <- sub("EH23[ab]", "EH23", afreq$facet2)
afreq$facet2[ afreq$facet2 == "EH23" ] <- "Allele"
afreq$facet2[ afreq$facet2 == "Het" ] <- "Heterozygosity"
#table(afreq$facet1)
table(afreq$facet2)
## 
##         Allele            Fis Heterozygosity 
##          45134          22567          22567
afreq[1:3, ]
##   CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 3  chr1 204226 270       263,277 0.4996639 1.998657       65      133       72
## 4  chr1 222275 270       270,270 0.5000000 2.000000       63      144       63
## 5  chr1 249752 270       263,277 0.4996639 1.998657       61      141       68
##   countNA ERBcount HO40count  ERBfreq HO40freq       He2   POS2 Frequency
## 3       0      263       277 0.487037 0.512963 0.4996639 204226  0.487037
## 4       0      270       270 0.500000 0.500000 0.5000000 222275  0.500000
## 5       0      263       277 0.487037 0.512963 0.4996639 249752  0.487037
##   facet1 facet2
## 3  EH23a Allele
## 4  EH23a Allele
## 5  EH23a Allele

Plot

library(ggplot2)

table(afreq$facet2)
## 
##         Allele            Fis Heterozygosity 
##          45134          22567          22567
#
afreq <- afreq[afreq$facet2 != "Heterozygosity", ]


#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.01)
my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)

#as.numeric(as.factor(afreq$CHROM))

#afreq$col2 <- viridisLite::magma( n = 10, alpha = 0.8, begin = 0, end = 1 )[as.numeric(as.factor(afreq$CHROM))]

# range(afreq$POS2)

p <- ggplot( data = afreq, 
             mapping = aes( x = POS2, 
                            #y = ERBfreq,
                            #y = freq,
                            y = Frequency,
                            color = CHROM )
                            #color = col2)
             )
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + theme( axis.title.x=element_blank(),
                axis.text.x = element_text(angle = 60, hjust = 1)
#                plot.margin = unit(c(0.1,0.1,1,0.1), "cm")
                )
#p <- p + theme(axis.title.x = element_blank(), 
#               axis.text.x = element_text( angle = 60, hjust = 1))
# p + scale_x_discrete(breaks = EH23a$mids2, labels = EH23a$Id)
# p + scale_x_discrete(breaks = EH23a$mids2)
p <- p + ggplot2::scale_x_continuous( 
  breaks = nucs$mids,
  expand = expansion(mult = 0.01, add = 0.0),
#  expand = expansion(mult = 0.02, add = 0.0),
  labels = sub("a.chr", ".chr", nucs$Id)
)
p <- p + theme( panel.grid.major.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )
p <- p + theme( panel.grid.minor.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )

# p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 1, color = "#808080", linewidth = 0.4 )
# p <- p + geom_vline( xintercept = 0, linetype = 1, color = "#808080", linewidth = 0.4 )
#p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 5, color = "#808080", linewidth = 0.4 )
p <- p + geom_vline( xintercept = nucs$ends, linetype = 5, color = "#808080", linewidth = 0.4 )

p <- p + geom_vline( xintercept = 0, linetype = 5, color = "#808080", linewidth = 0.4 )

#p <- p + geom_hline( yintercept = 0.5, linetype = 1, color = "#000000", linewidth = 1 )
#p <- p + geom_hline( yintercept = c(0, 0.5), linetype = 1, color = "#000000", linewidth = 1 )


#p + scale_color_manual( values = viridisLite::magma( n = 10, alpha = 0.01, begin = 0.2, end = 0.9 ) )
#p <- p + xlim(100, sum(nuccomp$Length))

#p <- p + ylim(0.3, 0.7)
p <- p + scale_color_manual( values = my_pal )
#
#p <- 
p + facet_grid(facet1 ~ ., scales = "free_y")

#p <- 
#ahplot + ylab("")

# ahplot <- ahplot + scale_y_continuous( breaks = seq(-1.0, 1.0, by = 0.2),
#                   minor_breaks = seq(-1, 1, by = 0.1) )

ahplot <- p + facet_grid(facet2 ~ . , scales = "free_y")
#ahplot <-   ahplot + theme( panel.grid.major.y = element_line(color = "#696969", linewidth = 0.6 ) )
ahplot <- ahplot + theme( panel.grid.major.y = element_line(color = "#A9A9A9", linewidth = 0.4 ) )
#ahplot + theme( panel.grid.major.y = element_line(color = "#C0C0C0", linewidth = 0.2, linetype = 1 ) )

#  myPlots[[i]] <- myPlots[[i]] + theme( panel.grid.minor.y=element_line(color = "#C0C0C0", size=0.2) )
ahplot <- ahplot + ylab(NULL)
#p
ahplot

Composite plot

library("ggpubr")

ggarrange(
  plotlist = list(pEH23, ahplot),
  labels = c("A", "B"),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.

# ggsave( filename = "EH23_chroms_freqs.tiff", 
#         device = "tiff", 
#         width = 6.5, height = 9, units = "in", 
#         dpi = 300,
#         compression = "lzw")
t99 <- Sys.time()
t99 - t1
## Time difference of 4.447549 mins